## Women Talking: Bringing the Environment into UK Parliamentary Debates 
## Data Analysis 

library(dplyr)
library(ggplot2)
library(tidyverse)
library(stargazer)
library(jtools)
library(ExPanDaR)
library(sandwich)
library(lmtest)
library(tseries)
library(foreign)
library(stringr)
library(quanteda)
library(ggplot2)
library(tidyverse)
library(foreign)
library(MASS)
library(zoo)
library(broom)
library(sjPlot)
library(ggplot2)
library(clubSandwich)
library(modelsummary)
library(car)

## NO NEED TO REPLICATE: Why are some YearsExp negative? It's because some MPs didn't serve consecutive terms and/or
## their districts changed. I manually changed the 'wrong' dates in the full dataset.
Data$YearsExp<-as.numeric(Data$YearsExp)### figuring out why negatives exist
mean(Data$YearsExp)
unique(Data$YearsExp)

Years<-subset(Data, YearsExp<0)
View(Years)
nrow(Years)
unique(Years$speakername)
keep<-c("mnis_id", "speakername", "year", "start_date", "gender")
ManualStartDate<-Years[,keep] ## 707,022 observations
ManualStartDate <- distinct(ManualStartDate)
View(ManualStartDate)
write.csv(ManualStartDate, "ManualStartDate.csv", row.names = FALSE, quote = TRUE) ## this is a table of all the people from the Hansard speech data
#I used this to then manually fix this in 2010FinalData.csv

### START HERE:

Data<-read.csv(".../2010FinalData.csv")
library(lubridate)
Data$date <- dmy(Data$date) ##2021-04-29
max_date <- max(Data$date, na.rm = TRUE)
print(max_date)
nrow(Data) ## 687,645 observations
Data$YearsExp<-Data$year-Data$start_date
unique(Data$YearsExp)
Data$YearsExp<-as.numeric(Data$YearsExp) 
mean(Data$YearsExp, na.rm = TRUE)

# Filter out rows where speakername is "Madam Deputy Speaker" or "The Second Deputy Chairman"
Data <- Data %>%
  filter(!(speakername %in% c("Madam Deputy Speaker", "The Second Deputy Chairman", "An hon. Member", 
                              "An hon. Member:", "Hon. Members", "Hon. Members:", "Members of the House of Commons", 
                              "Mr Deputy Speaker", "Mr Speaker", "Mr Speaker-Elect", "Mr. Speaker", 
                              "My Lords and Members of the House of Commons,", "	
My Lords and Members of the House of Commons.", "The Chairman", "The First Deputy Chairman", "The Queen", "The Temporary Chairman", 
                              "My Lords and Members of the House of Commons", 	
                              "Several hon. Members", "My Lords and Members of the House of Commons.")))
nrow(Data) ## 670,856
## Make parliamentary roles variable binary
Data$parly_role <- ifelse(Data$parly_role %in% c("Speaker", "Deputy Speaker"), 1, 0)

##Descriptive Statistics

## Total speeches
nrow(Data) ## 670,856

## Men vs women speakers 
speakers<- unique(Data$speakername)
length(speakers)

count_gen_f <- Data %>%
  filter(gender == "F") %>%
  distinct(speakername) %>%
  nrow()
count_gen_m <- Data %>%
  filter(gender == "M") %>%
  distinct(speakername) %>%
  nrow()

print(paste("Women Speakers", count_gen_f))
print(paste("Men Speakers", count_gen_m))

## Men vs women speeches given
Data <- Data %>%
  mutate(observation_number = row_number())

count_gen_f2 <- Data %>%
  filter(gender == "F") %>%
  distinct(observation_number) %>%
  nrow()
count_gen_m2 <- Data %>%
  filter(gender == "M") %>%
  distinct(observation_number) %>%
  nrow()
print(paste("Women Speakers:", count_gen_f2))
print(paste("Men Speakers:", count_gen_m2))

## Environmental Speech count
count_ones <- sum(Data$EnvDummy == 1)
count_zeros <- sum(Data$EnvDummy == 0)
print(count_ones)
print(count_zeros)

## Years of experience 
mean(Data$YearsExp)

## Env Debates count
count_ones2 <- sum(Data$DebateTopic == 1)
count_zeros2 <- sum(Data$DebateTopic == 0)
print(count_ones2)
print(count_zeros2)

## Env Committee membership
count_comittee_0 <- Data %>%
  filter(EnvCommittee == 0) %>%
  distinct(speakername) %>%
  nrow()
count_committee_1 <- Data %>%
  filter(EnvCommittee == 1) %>%
  distinct(speakername) %>%
  nrow()
print(paste("Number of non-Env Committee members:", count_comittee_0))
print(paste("Number Env Committee members:", count_committee_1))

## Gender of Env Committee Members
count_comitteegen_M <- Data %>%
  filter(EnvCommittee == 1) %>%
  filter(gender == "M") %>%
  distinct(speakername) %>%
  nrow()

count_comitteegen_F <- Data %>%
  filter(EnvCommittee == 1) %>%
  filter(gender == "F") %>%
  distinct(speakername) %>%
  nrow()
print(paste("Number of Men Env Committee members:", count_comitteegen_M))
print(paste("Number of Women Env Committee members:", count_comitteegen_F))


## Ruling Party membership
count_conserv_0 <- Data %>%
  filter(RulingParty == 0) %>%
  distinct(speakername) %>%
  nrow()
count_conserv_1 <- Data %>%
  filter(RulingParty == 1) %>%
  distinct(speakername) %>%
  nrow()
print(paste("Number of Non-Conservative:", count_conserv_0))
print(paste("Number of Conservatives:", count_conserv_1))

## Plot:  Total environmental Speeches per party
Party<-read.csv(".../PartyProportions.csv")
Party$party <- factor(Party$party, levels = c("Conservative", "Labour", "Liberal Democrat","Independent ", "Scottish National Party","Labour (Co-op)", "Democratic Unionist Party"))
ggplot(Party, aes(x = party, y = TotalEnvSpeeches)) +
  geom_bar(stat = "identity", position = position_dodge(width = 0.9), width = 0.7) +
  labs(title = "Total Environmental Speeches per Party", 
       x = "Party", 
       y = "Environmental Speeches")+
  theme_minimal()+
  theme(plot.title = element_text(hjust = 0.5))

## Tracking over time: % of env speeches by women - % of env speeches by men= gender gap. Plot it. 

##2010
WomEnvSpeech2010 <- Data %>%
  filter(EnvDummy == "1") %>%
  filter(year=="2010") %>%
  filter(gender=="F") %>%
  nrow()
print(WomEnvSpeech2010)

WomSpeeches2010 <- Data %>%
  filter(year==2010) %>%
  filter(gender=="F") %>%
  nrow()
print(WomSpeeches2010)

EnvWomen10<-(WomEnvSpeech2010/WomSpeeches2010)*100
print(EnvWomen10)

MenEnvSpeech2010<-Data %>%
  filter(EnvDummy == "1") %>%
  filter(year=="2010") %>%
  filter(gender=="M")%>%
  nrow()
print(MenEnvSpeech2010)

MenSpeeches2010 <- Data %>%
  filter(year=="2010") %>%
  filter(gender=="M") %>%
  nrow()
print(MenSpeeches2010)

EnvMen10<-(MenEnvSpeech2010/MenSpeeches2010)*100
print(EnvMen10)

Gap2010<-EnvWomen10-EnvMen10
print(Gap2010)

## 2011
WomEnvSpeech2011 <- Data %>%
  filter(EnvDummy == "1") %>%
  filter(year=="2011") %>%
  filter(gender=="F") %>%
  nrow()
print(WomEnvSpeech2011)

WomSpeeches2011 <- Data %>%
  filter(year==2011) %>%
  filter(gender=="F") %>%
  nrow()
print(WomSpeeches2011)

EnvWomen11<-(WomEnvSpeech2011/WomSpeeches2011)*100
print(EnvWomen11)

MenEnvSpeech2011<-Data %>%
  filter(EnvDummy == "1") %>%
  filter(year=="2011") %>%
  filter(gender=="M")%>%
  nrow()
print(MenEnvSpeech2011)

MenSpeeches2011 <- Data %>%
  filter(year=="2011") %>%
  filter(gender=="M") %>%
  nrow()
print(MenSpeeches2011)

EnvMen11<-(MenEnvSpeech2011/MenSpeeches2011)*100
print(EnvMen11)
Gap2011<-EnvWomen11-EnvMen11
print(Gap2011)

## 2012
WomEnvSpeech2012 <- Data %>%
  filter(EnvDummy == "1") %>%
  filter(year=="2012") %>%
  filter(gender=="F") %>%
  nrow()
print(WomEnvSpeech2012)

WomSpeeches2012 <- Data %>%
  filter(year==2012) %>%
  filter(gender=="F") %>%
  nrow()
print(WomSpeeches2012)

EnvWomen12<-(WomEnvSpeech2012/WomSpeeches2012)*100
print(EnvWomen12)

MenEnvSpeech2012<-Data %>%
  filter(EnvDummy == "1") %>%
  filter(year=="2012") %>%
  filter(gender=="M")%>%
  nrow()
print(MenEnvSpeech2012)

MenSpeeches2012 <- Data %>%
  filter(year=="2012") %>%
  filter(gender=="M") %>%
  nrow()
print(MenSpeeches2012)

EnvMen12<-(MenEnvSpeech2012/MenSpeeches2012)*100
print(EnvMen12)
Gap2012<-EnvWomen12-EnvMen12
print(Gap2012)

## 2013
WomEnvSpeech2013 <- Data %>%
  filter(EnvDummy == "1") %>%
  filter(year=="2013") %>%
  filter(gender=="F") %>%
  nrow()
print(WomEnvSpeech2013)

WomSpeeches2013 <- Data %>%
  filter(year==2013) %>%
  filter(gender=="F") %>%
  nrow()
print(WomSpeeches2013)

EnvWomen13<-(WomEnvSpeech2013/WomSpeeches2013)*100
print(EnvWomen13)

MenEnvSpeech2013<-Data %>%
  filter(EnvDummy == "1") %>%
  filter(year=="2013") %>%
  filter(gender=="M")%>%
  nrow()
print(MenEnvSpeech2013)

MenSpeeches2013 <- Data %>%
  filter(year=="2013") %>%
  filter(gender=="M") %>%
  nrow()
print(MenSpeeches2013)

EnvMen13<-(MenEnvSpeech2013/MenSpeeches2013)*100
print(EnvMen13)
Gap2013<-EnvWomen13-EnvMen13
print(Gap2013)

## 2014
WomEnvSpeech2014 <- Data %>%
  filter(EnvDummy == "1") %>%
  filter(year=="2014") %>%
  filter(gender=="F") %>%
  nrow()
print(WomEnvSpeech2014)

WomSpeeches2014 <- Data %>%
  filter(year==2014) %>%
  filter(gender=="F") %>%
  nrow()
print(WomSpeeches2014)

EnvWomen14<-(WomEnvSpeech2014/WomSpeeches2014)*100
print(EnvWomen14)

MenEnvSpeech2014<-Data %>%
  filter(EnvDummy == "1") %>%
  filter(year=="2014") %>%
  filter(gender=="M")%>%
  nrow()
print(MenEnvSpeech2014)

MenSpeeches2014 <- Data %>%
  filter(year=="2014") %>%
  filter(gender=="M") %>%
  nrow()
print(MenSpeeches2014)

EnvMen14<-(MenEnvSpeech2014/MenSpeeches2014)*100
print(EnvMen14)
Gap2014<-EnvWomen14-EnvMen14
print(Gap2014)

## 2015

WomEnvSpeech2015 <- Data %>%
  filter(EnvDummy == "1") %>%
  filter(year=="2015") %>%
  filter(gender=="F") %>%
  nrow()
print(WomEnvSpeech2015)

WomSpeeches2015 <- Data %>%
  filter(year==2015) %>%
  filter(gender=="F") %>%
  nrow()
print(WomSpeeches2015)

EnvWomen15<-(WomEnvSpeech2015/WomSpeeches2015)*100
print(EnvWomen15)

MenEnvSpeech2015<-Data %>%
  filter(EnvDummy == "1") %>%
  filter(year=="2015") %>%
  filter(gender=="M")%>%
  nrow()
print(MenEnvSpeech2015)

MenSpeeches2015 <- Data %>%
  filter(year=="2015") %>%
  filter(gender=="M") %>%
  nrow()
print(MenSpeeches2015)

EnvMen15<-(MenEnvSpeech2015/MenSpeeches2015)*100
print(EnvMen15)
Gap2015<-EnvWomen15-EnvMen15
print(Gap2015)

## 2016

WomEnvSpeech2016 <- Data %>%
  filter(EnvDummy == "1") %>%
  filter(year=="2016") %>%
  filter(gender=="F") %>%
  nrow()
print(WomEnvSpeech2016)

WomSpeeches2016 <- Data %>%
  filter(year==2016) %>%
  filter(gender=="F") %>%
  nrow()
print(WomSpeeches2016)

EnvWomen16<-(WomEnvSpeech2016/WomSpeeches2016)*100
print(EnvWomen16)

MenEnvSpeech2016<-Data %>%
  filter(EnvDummy == "1") %>%
  filter(year=="2016") %>%
  filter(gender=="M")%>%
  nrow()
print(MenEnvSpeech2016)

MenSpeeches2016 <- Data %>%
  filter(year=="2016") %>%
  filter(gender=="M") %>%
  nrow()
print(MenSpeeches2016)

EnvMen16<-(MenEnvSpeech2016/MenSpeeches2016)*100
print(EnvMen16)
Gap2016<-EnvWomen16-EnvMen16
print(Gap2016)

## 2017

WomEnvSpeech2017 <- Data %>%
  filter(EnvDummy == "1") %>%
  filter(year=="2017") %>%
  filter(gender=="F") %>%
  nrow()
print(WomEnvSpeech2017)

WomSpeeches2017 <- Data %>%
  filter(year==2017) %>%
  filter(gender=="F") %>%
  nrow()
print(WomSpeeches2017)

EnvWomen17<-(WomEnvSpeech2017/WomSpeeches2017)*100
print(EnvWomen17)

MenEnvSpeech2017<-Data %>%
  filter(EnvDummy == "1") %>%
  filter(year=="2017") %>%
  filter(gender=="M")%>%
  nrow()
print(MenEnvSpeech2017)

MenSpeeches2017 <- Data %>%
  filter(year=="2017") %>%
  filter(gender=="M") %>%
  nrow()
print(MenSpeeches2017)

EnvMen17<-(MenEnvSpeech2017/MenSpeeches2017)*100
print(EnvMen17)
Gap2017<-EnvWomen17-EnvMen17
print(Gap2017)

## 2018
WomEnvSpeech2018 <- Data %>%
  filter(EnvDummy == "1") %>%
  filter(year=="2018") %>%
  filter(gender=="F") %>%
  nrow()
print(WomEnvSpeech2018)

WomSpeeches2018 <- Data %>%
  filter(year==2018) %>%
  filter(gender=="F") %>%
  nrow()
print(WomSpeeches2018)

EnvWomen18<-(WomEnvSpeech2018/WomSpeeches2018)*100
print(EnvWomen18)

MenEnvSpeech2018<-Data %>%
  filter(EnvDummy == "1") %>%
  filter(year=="2018") %>%
  filter(gender=="M")%>%
  nrow()
print(MenEnvSpeech2018)

MenSpeeches2018 <- Data %>%
  filter(year=="2018") %>%
  filter(gender=="M") %>%
  nrow()
print(MenSpeeches2018)

EnvMen18<-(MenEnvSpeech2018/MenSpeeches2018)*100
print(EnvMen18)
Gap2018<-EnvWomen18-EnvMen18
print(Gap2018)

## 2019

WomEnvSpeech2019 <- Data %>%
  filter(EnvDummy == "1") %>%
  filter(year=="2019") %>%
  filter(gender=="F") %>%
  nrow()
print(WomEnvSpeech2019)

WomSpeeches2019 <- Data %>%
  filter(year==2019) %>%
  filter(gender=="F") %>%
  nrow()
print(WomSpeeches2019)

EnvWomen19<-(WomEnvSpeech2019/WomSpeeches2019)*100
print(EnvWomen19)

MenEnvSpeech2019<-Data %>%
  filter(EnvDummy == "1") %>%
  filter(year=="2019") %>%
  filter(gender=="M")%>%
  nrow()
print(MenEnvSpeech2019)

MenSpeeches2019 <- Data %>%
  filter(year=="2019") %>%
  filter(gender=="M") %>%
  nrow()
print(MenSpeeches2019)

EnvMen19<-(MenEnvSpeech2019/MenSpeeches2019)*100
print(EnvMen19)
Gap2019<-EnvWomen19-EnvMen19
print(Gap2019)

## 2020

WomEnvSpeech2020 <- Data %>%
  filter(EnvDummy == "1") %>%
  filter(year=="2020") %>%
  filter(gender=="F") %>%
  nrow()
print(WomEnvSpeech2020)

WomSpeeches2020 <- Data %>%
  filter(year==2020) %>%
  filter(gender=="F") %>%
  nrow()
print(WomSpeeches2020)

EnvWomen20<-(WomEnvSpeech2020/WomSpeeches2020)*100
print(EnvWomen20)

MenEnvSpeech2020<-Data %>%
  filter(EnvDummy == "1") %>%
  filter(year=="2020") %>%
  filter(gender=="M")%>%
  nrow()
print(MenEnvSpeech2020)

MenSpeeches2020 <- Data %>%
  filter(year=="2020") %>%
  filter(gender=="M") %>%
  nrow()
print(MenSpeeches2020)

EnvMen20<-(MenEnvSpeech2020/MenSpeeches2020)*100
print(EnvMen20)
Gap2020<-EnvWomen20-EnvMen20
print(Gap2020)

## 2021

WomEnvSpeech21 <- Data %>%
  filter(EnvDummy == "1") %>%
  filter(year=="2021") %>%
  filter(gender=="F") %>%
  nrow()
print(WomEnvSpeech21)

WomSpeeches21 <- Data %>%
  filter(year==2021) %>%
  filter(gender=="F") %>%
  nrow()
print(WomSpeeches21)

EnvWomen21<-(WomEnvSpeech21/WomSpeeches21)*100
print(EnvWomen21)

MenEnvSpeech21<-Data %>%
  filter(EnvDummy == "1") %>%
  filter(year=="2021") %>%
  filter(gender=="M")%>%
  nrow()
print(MenEnvSpeech21)

MenSpeeches21 <- Data %>%
  filter(year=="2021") %>%
  filter(gender=="M") %>%
  nrow()
print(MenSpeeches21)

EnvMen21<-(MenEnvSpeech21/MenSpeeches21)*100
print(EnvMen21)
Gap21<-EnvWomen21-EnvMen21
print(Gap21)

## Manually created an Excel csv with these values; read in here

GenderGap<-read.csv(".../GenderGap.csv")

# Create a line plot with points

##Gender Gap
ggplot(GenderGap, aes(x = Year, y =GenderGap)) +
  geom_line() +
  geom_point() +
  xlim(2010, 2021) +  
  scale_x_continuous(breaks = seq(min(GenderGap$Year), max(GenderGap$Year), by = 1))+
  ylim(-1, 3) +  
  geom_hline(yintercept = 0, color = "black", linetype = "solid", size = 0.9) +
  geom_smooth(method = "lm", se = FALSE, color = "grey", linetype="dashed", size=0.75) +
  labs(title = "Gender Gap in Speech Environmentalism Over Time", x = "Year", y = "Gender Gap (%)") +
  theme_minimal()

## Men Vs Women Proportion env speeches
ggplot(GenderGap, aes(x = Year)) +
  geom_line(aes(y = WomEnvSpeech, color = "Women's Env Speech")) +
  geom_point(aes(y = WomEnvSpeech, color = "Women's Env Speech")) +
  geom_line(aes(y = MenEnvSpeech, color = "Men's Env Speech")) +
  geom_point(aes(y = MenEnvSpeech, color = "Men's Env Speech")) +
  xlim(2010, 2021) +
  scale_x_continuous(breaks = seq(2010, 2021, by = 1)) +
  ylim(0, 9) +
  geom_hline(yintercept = 0, color = "black", linetype = "solid", size = 0.5) +
  scale_color_manual(values = c("Men's Env Speech" = "blue", "Women's Env Speech" = "Red"),  # Set colors
                     labels = c("Men's Env Speech" = "Men MPs", "Women's Env Speech" = "Women MPs")) +
  labs(title = "Proportion of Women and Men's Environmental Speeches: 2010-2021",
       x = "Year", y = "% Env. Speeches of Total Speeches",
       color = "Legend") +
  theme_minimal()

## By Gender
# Summing EnvDummy by year and gender, excluding the year 2021 and filtering out NA gender
yearly_env_sum <- Data %>%
  filter(year != 2021, !is.na(gender)) %>%  # Exclude 2021 and remove rows where gender is NA
  group_by(year, gender) %>%
  summarise(total_envdummy = sum(EnvDummy)) %>%
  ungroup()

# Creating total EnvDummy for each year (without gender)
total_env_sum <- Data %>%
  filter(year != 2021) %>%
  group_by(year) %>%
  summarise(total_envdummy = sum(EnvDummy)) %>%
  mutate(gender = "Total")  # Add a 'Total' label for consistency in plotting

# Combine gender-specific data with total data
combined_data <- bind_rows(yearly_env_sum, total_env_sum)

# Ensure 'year' is treated as an integer
combined_data$year <- as.integer(combined_data$year)

# Plot the data with separate lines for each gender and total
ggplot(combined_data, aes(x = year, y = total_envdummy, color = gender)) +
  geom_line() +
  geom_point() +
  scale_x_continuous(breaks = combined_data$year) +  # Ensure only whole years appear on x-axis
  scale_color_manual(values = c("F" = "red", "M" = "blue", "Total" = "black"),  # Set colors
                     labels = c("F" = "Female", "M" = "Male", "Total" = "Total")) +  # Set labels
  labs(title = "Total Environmental Speeches per Year: 2010-2020",
       x = "Year", y = "Number of Environmental Speeches") +
  theme_minimal()


## REGRESSIONS

## Aggregating data
## Export two tables, make one of variables to be aggregated and another for the factors that I want
## to put back in the dataset.

### turn factors into characters to aggreate

Data$mnis_id<-as.character(Data$mnis_id)
Data$gender<-as.character(Data$gender)
Data$party<-as.character(Data$party)
Data$constituency<-as.character(Data$constituency)
Data$gvt_role<-as.character(Data$gvt_role)
Data$opp_role<-as.character(Data$opp_role)
Data$parly_role<-as.character(Data$parly_role)
Data$EnvCommittee<-as.character(Data$EnvCommittee)
Data$RulingParty<-as.character(Data$RulingParty)
Data$YearsExp<-as.character(Data$YearsExp)
Data$EnvDummy<-as.numeric(Data$EnvDummy)
Data$DebateTopic<-as.numeric(Data$DebateTopic)
Data$YearsExp<-as.numeric(Data$YearsExp)

Aggregate1 <- aggregate(EnvDummy ~ speakername + year, data = Data, FUN = sum)
Aggregate2 <- aggregate(DebateTopic ~ speakername + year, data = Data, FUN = sum)
Aggregate3<- aggregate(YearsExp ~ speakername + year, data = Data, FUN = mean) ##taking the average years of experience over the period
Aggregate4 <- Data %>%
  group_by(speakername, year) %>%
  summarise(
    count_0 = sum(EnvDummy == 0),
    env_speeches = sum(EnvDummy == 1),
    total_speeches = count_0 + env_speeches,
    EnvRatio = round(env_speeches / total_speeches, 2)
  ) %>%
  ungroup()
detach("package:MASS", unload=TRUE)
Aggregate4<-Aggregate4 %>%
  select(speakername, year, count_0, env_speeches, total_speeches, EnvRatio)
AggData<-merge(Aggregate1, Aggregate2, by=c("speakername", "year"))
AggData<-merge(AggData, Aggregate3,by=c("speakername", "year"))
AggData<-merge(AggData, Aggregate4, by=c("speakername", "year"))


Factors <- Data %>%
  select(speakername, year, mnis_id, gender, party, constituency, gvt_role, opp_role, parly_role, EnvCommittee, RulingParty) ## make sure that MASS isn't loaded-- dplyr instead
Factors<- distinct(Factors,  speakername, year, .keep_all = TRUE)

AggregatedData<-merge(AggData, Factors, by=c("speakername", "year"))
unique(AggregatedData$year)
nrow(AggregatedData)

## Testing if NBR is correct: 

# Fit a Poisson regression model
AggregatedData$gender<- as.factor(AggregatedData$gender)
AggregatedData$gender <- relevel(AggregatedData$gender, ref = "M")
poisson_model <- glm(EnvDummy~gender+YearsExp+gvt_role+opp_role+parly_role+ EnvCommittee+RulingParty+DebateTopic, data = AggregatedData, family = "poisson")

# Calculate residual deviance
residual_deviance <- sum(resid(poisson_model, type = "pearson")^2)

# Calculate dispersion parameter
df <- df.residual(poisson_model)
dispersion <- residual_deviance / df

# Check if dispersion parameter is significantly greater than 1
if (dispersion > 1) {
  print("Data is overdispersed.")
} else {
  print("Data is not overdispersed.")
}

### poisson model is not best for count data: data is over-dispersed. 

## M as reference 
AggregatedData$gender<-as.factor(AggregatedData$gender)
AggregatedData$gender <- relevel(AggregatedData$gender, ref = "M")
AggregatedData$EnvDummy<-as.integer(AggregatedData$EnvDummy)
AggregatedData$year<-as.factor(AggregatedData$year)
AggregatedData$DebateTopic<-as.numeric(AggregatedData$DebateTopic)
AggregatedData$YearsExp<-as.numeric(AggregatedData$YearsExp)
AggregatedData$mnis_id<-as.factor(AggregatedData$mnis_id)
AggregatedData$party<-as.factor(AggregatedData$party)
AggregatedData$constituency<-as.factor(AggregatedData$constituency)
AggregatedData$gvt_role<-as.numeric(AggregatedData$gvt_role)
AggregatedData$opp_role<-as.numeric(AggregatedData$opp_role)
AggregatedData$parly_role<-as.numeric(AggregatedData$parly_role)
AggregatedData$EnvCommittee<-as.numeric(AggregatedData$EnvCommittee)
AggregatedData$RulingParty<-as.numeric(AggregatedData$RulingParty)

## Data structure for rates of env speeches during environmental debates vs. non-environmental debates 
EnvDebates<-subset(Data, DebateTopic=="1")
NotEnvDebates<-subset(Data,DebateTopic=="0")
View(EnvDebates)

##Environmental Debates
EnvDebates$mnis_id<-as.character(EnvDebates$mnis_id)
EnvDebates$gender<-as.character(EnvDebates$gender)
EnvDebates$party<-as.character(EnvDebates$party)
EnvDebates$constituency<-as.character(EnvDebates$constituency)
EnvDebates$gvt_role<-as.character(EnvDebates$gvt_role)
EnvDebates$opp_role<-as.character(EnvDebates$opp_role)
EnvDebates$parly_role<-as.character(EnvDebates$parly_role)
EnvDebates$EnvCommittee<-as.character(EnvDebates$EnvCommittee)
EnvDebates$RulingParty<-as.character(EnvDebates$RulingParty)
EnvDebates$YearsExp<-as.character(EnvDebates$YearsExp)
EnvDebates$EnvDummy<-as.numeric(EnvDebates$EnvDummy)
EnvDebates$DebateTopic<-as.numeric(EnvDebates$DebateTopic)
EnvDebates$YearsExp<-as.numeric(EnvDebates$YearsExp)

EnvAggregate1 <- aggregate(EnvDummy ~ speakername + year, data = EnvDebates, FUN = sum)
EnvAggregate2 <- aggregate(DebateTopic ~ speakername + year, data = EnvDebates, FUN = sum)
EnvAggregate3<- aggregate(YearsExp ~ speakername + year, data = EnvDebates, FUN = mean) ##taking the average years of experience over the period
EnvAggregate4 <- EnvDebates %>%
  group_by(speakername, year) %>%
  summarise(
    count_0 = sum(EnvDummy == 0),
    env_speeches = sum(EnvDummy == 1),
    total_speeches = count_0 + env_speeches,
    EnvRatio = round(env_speeches / total_speeches, 2)
  ) %>%
  ungroup()
detach("package:MASS", unload=TRUE)
EnvAggregate4<-EnvAggregate4 %>%
  select(speakername, year, count_0, env_speeches, total_speeches, EnvRatio)
EnvAggData<-merge(EnvAggregate1, EnvAggregate2, by=c("speakername", "year"))
EnvAggData<-merge(EnvAggData, EnvAggregate3,by=c("speakername", "year"))
EnvAggData<-merge(EnvAggData, EnvAggregate4, by=c("speakername", "year"))

EnvFactors <- EnvDebates %>%
  select(speakername, year, mnis_id, gender, party, constituency, gvt_role, opp_role, parly_role, EnvCommittee, RulingParty) ## make sure that MASS isn't loaded-- dplyr instead
EnvFactors<- distinct(EnvFactors,  speakername, year, .keep_all = TRUE)

EnvAggregatedData<-merge(EnvAggData, EnvFactors, by=c("speakername", "year"))
unique(EnvAggregatedData$year)
nrow(EnvAggregatedData)
View(EnvAggregatedData)

EnvAggregatedData$gender<-as.factor(EnvAggregatedData$gender)
EnvAggregatedData$gender <- relevel(EnvAggregatedData$gender, ref = "M")

## NOT Env debates 
NotEnvDebates$mnis_id<-as.character(NotEnvDebates$mnis_id)
NotEnvDebates$gender<-as.character(NotEnvDebates$gender)
NotEnvDebates$party<-as.character(NotEnvDebates$party)
NotEnvDebates$constituency<-as.character(NotEnvDebates$constituency)
NotEnvDebates$gvt_role<-as.character(NotEnvDebates$gvt_role)
NotEnvDebates$opp_role<-as.character(NotEnvDebates$opp_role)
NotEnvDebates$parly_role<-as.character(NotEnvDebates$parly_role)
NotEnvDebates$EnvCommittee<-as.character(NotEnvDebates$EnvCommittee)
NotEnvDebates$RulingParty<-as.character(NotEnvDebates$RulingParty)
NotEnvDebates$YearsExp<-as.character(NotEnvDebates$YearsExp)
NotEnvDebates$EnvDummy<-as.numeric(NotEnvDebates$EnvDummy)
NotEnvDebates$DebateTopic<-as.numeric(NotEnvDebates$DebateTopic)
NotEnvDebates$YearsExp<-as.numeric(NotEnvDebates$YearsExp)

NotEnvAggregate1 <- aggregate(EnvDummy ~ speakername + year, data = NotEnvDebates, FUN = sum)
NotEnvAggregate2 <- aggregate(DebateTopic ~ speakername + year, data = NotEnvDebates, FUN = sum)
NotEnvAggregate3<- aggregate(YearsExp ~ speakername + year, data = NotEnvDebates, FUN = mean) ##taking the average years of experience over the period
NotEnvAggregate4 <- NotEnvDebates %>%
  group_by(speakername, year) %>%
  summarise(
    count_0 = sum(EnvDummy == 0),
    env_speeches = sum(EnvDummy == 1),
    total_speeches = count_0 + env_speeches,
    EnvRatio = round(env_speeches / total_speeches, 2)
  ) %>%
  ungroup()
detach("package:MASS", unload=TRUE)
NotEnvAggregate4<-NotEnvAggregate4 %>%
  select(speakername, year, count_0, env_speeches, total_speeches, EnvRatio)
NotEnvAggData<-merge(NotEnvAggregate1, NotEnvAggregate2, by=c("speakername", "year"))
NotEnvAggData<-merge(NotEnvAggData, NotEnvAggregate3,by=c("speakername", "year"))
NotEnvAggData<-merge(NotEnvAggData, NotEnvAggregate4, by=c("speakername", "year"))

NotEnvFactors <- NotEnvDebates %>%
  select(speakername, year, mnis_id, gender, party, constituency, gvt_role, opp_role, parly_role, EnvCommittee, RulingParty) ## make sure that MASS isn't loaded-- dplyr instead
NotEnvFactors<- distinct(NotEnvFactors,  speakername, year, .keep_all = TRUE)

NotEnvAggregatedData<-merge(NotEnvAggData, NotEnvFactors, by=c("speakername", "year"))
unique(NotEnvAggregatedData$year)
nrow(NotEnvAggregatedData)
View(NotEnvAggregatedData)

NotEnvAggregatedData$gender<-as.factor(NotEnvAggregatedData$gender)
NotEnvAggregatedData$gender <- relevel(NotEnvAggregatedData$gender, ref = "M")

library(MASS)

## Full sample: EnvDummy~gender 
AggregatedData$gender <- relevel(AggregatedData$gender, ref = "M")
LM2<-glm.nb(EnvDummy~gender+YearsExp+gvt_role+opp_role+ EnvCommittee+RulingParty+DebateTopic+as.factor(year), data = AggregatedData, control = glm.control(maxit = 100))

## Clustered Standard Errors
cluster_vcov2 <- vcovCR(LM2, cluster = AggregatedData$speakername, type = "CR0")

p_value <- with(LM2, pchisq(deviance, df.residual, lower.tail = FALSE))
p_value #0.071 = bigger than 0.05 = good.

## No values over 10, so no multicollinearity
vif(LM2)

## Comparing Models w/ and w/o gender
LM2_reduced <- update(LM2, . ~ . - gender)
anova(LM2_reduced, LM2, test = "Chisq")
#______________
## Full sample: Rate of env speeches ~ gender 
LM2.1 <- glm(
  cbind(env_speeches, total_speeches - env_speeches) ~
    gender + YearsExp + gvt_role + opp_role + EnvCommittee + RulingParty + DebateTopic + as.factor(year),
  data = AggregatedData,
  family = binomial(),
  control = glm.control(maxit = 100)
)
cluster_vcov2.1 <- vcovCR(LM2.1, cluster = AggregatedData$speakername, type = "CR0")

p_value <- with(LM2.1, pchisq(deviance, df.residual, lower.tail = FALSE))
p_value #0 = bigger than 0.05 = good.

## No values over 10, so no multicollinearity
vif(LM2.1)

## Comparing Models w/ and w/o gender
LM2.1_reduced <- update(LM2.1, . ~ . - gender)
anova(LM2.1_reduced, LM2.1, test = "Chisq")

## Print 
modelsummary(
  list(
    "Default Standard Errors" = LM2,
    "Clustered Standard Errors" = LM2,
    "Default Standard Errors" = LM2.1,
    "Clustered Standard Errors" = LM2.1
  ),
  vcov = list(
    NULL,         # default vcov for first
    cluster_vcov2, # clustered vcov for second
    NULL,
    cluster_vcov2.1
  ),
  exponentiate = TRUE,
  conf_level = 0.95,
  statistic = "std.error",
  stars = TRUE
)

#______________________
## DEBATE MODELS

## Environmental Dummy: Not during Env Debates
DebateModel1<-glm.nb(EnvDummy~gender+YearsExp+gvt_role+opp_role+ EnvCommittee+RulingParty+as.factor(year), data = NotEnvAggregatedData, control = glm.control(maxit = 100))
## Adding Clustered Standard Errors
cluster_vcov3 <- vcovCR(DebateModel1, cluster = NotEnvAggregatedData$speakername, type = "CR0")

p_value <- with(DebateModel1, pchisq(deviance, df.residual, lower.tail = FALSE))
p_value ## 0.772

## Comparing Models w/ and w/o gender
DebateModel1_reduced <- update(DebateModel1, . ~ . - gender)
anova(DebateModel1_reduced, DebateModel1, test = "Chisq")

## Environmental Dummy: DURING environmental Debates 
DebateModel2 <- glm.nb(DebateTopic ~ gender  + YearsExp + gvt_role + opp_role + EnvCommittee + RulingParty + as.factor(year), data = EnvAggregatedData)

## Adding Clustered Standard Errors
cluster_vcov4 <- vcovCR(DebateModel2, cluster = EnvAggregatedData$speakername, type = "CR2")

p_value <- with(DebateModel2, pchisq(deviance, df.residual, lower.tail = FALSE))
p_value ## 1

## Comparing Models w/ and w/o gender
DebateModel2_reduced <- update(DebateModel2, . ~ . - gender)
anova(DebateModel2_reduced, DebateModel2, test = "Chisq")

## Environmental Rate: Not Environmental Debates 
NotEnvReg <- glm(
  cbind(env_speeches, total_speeches - env_speeches) ~
    gender + YearsExp + gvt_role + opp_role + EnvCommittee + RulingParty  + as.factor(year),
  data = NotEnvAggregatedData,
  family = binomial(),
  control = glm.control(maxit = 100)
)
NotEnvcluster_vcov <- vcovCR(NotEnvReg, cluster = NotEnvAggregatedData$speakername, type = "CR0")

p_value <- with(NotEnvReg, pchisq(deviance, df.residual, lower.tail = FALSE))
p_value ## 0

## Comparing Models w/ and w/o gender
NotEnvReg_reduced <- update(NotEnvReg, . ~ . - gender)
anova(NotEnvReg_reduced, NotEnvReg, test = "Chisq")

## Environmental Rate: During Environmental Debates 
EnvReg <- glm(
  cbind(env_speeches, total_speeches - env_speeches) ~
    gender + YearsExp + gvt_role + opp_role + EnvCommittee + RulingParty + as.factor(year),
  data = EnvAggregatedData,
  family = binomial(),
  control = glm.control(maxit = 100)
)
Envcluster_vcov <- vcovCR(EnvReg, cluster = EnvAggregatedData$speakername, type = "CR0")

p_value <- with(EnvReg, pchisq(deviance, df.residual, lower.tail = FALSE))
p_value ## 9.005e-121

## Comparing Models w/ and w/o gender
EnvReg_reduced <- update(EnvReg, . ~ . - gender)
anova(EnvReg_reduced, EnvReg, test = "Chisq")

## Print
modelsummary(
  list(
    "Count: During Debates"= DebateModel2,
    "Count: Not Debates" =DebateModel1,
    "Rates: During Debates" = EnvReg,
    "Rates: Not Debates" = NotEnvReg
  ),
  vcov = list(
    cluster_vcov4,
    cluster_vcov3,
    Envcluster_vcov,         # default vcov for first
    NotEnvcluster_vcov # clustered vcov for second
  ),
  exponentiate = TRUE,
  conf_level = 0.95,
  statistic = "std.error",
  stars = TRUE
)

#_________________
## Party-Specific Regressions

### Conservative party analysis 
Conservative <- subset(AggregatedData, party %in% c("Conservative"))
Conservative$gender <- relevel(Conservative$gender, ref = "M")

## EnvDummy
Cons1<-glm.nb(EnvDummy~gender+YearsExp+gvt_role+ EnvCommittee+DebateTopic, data = Conservative, control = glm.control(maxit = 100))

##  Clustered Standard Errors
cluster_vcov_cons <- vcovCR(Cons1, cluster = Conservative$speakername, type = "CR0")

p_value <- with(Cons1, pchisq(deviance, df.residual, lower.tail = FALSE))
p_value #0.165 = bigger than 0.05 = good.

## Comparing Models w/ and w/o gender
Cons1_reduced <- update(Cons1, . ~ . - gender)
anova(Cons1_reduced, Cons1, test = "Chisq")

## Env speech rate
Cons1.2 <- glm(
  cbind(env_speeches, total_speeches - env_speeches) ~
    gender + YearsExp + gvt_role + opp_role + EnvCommittee + RulingParty + DebateTopic + as.factor(year),
  data = Conservative,
  family = binomial(),
  control = glm.control(maxit = 100)
)
Cons1.2cluster_vcov <- vcovCR(Cons1.2, cluster = Conservative$speakername, type = "CR0")

p_value <- with(Cons1.2, pchisq(deviance, df.residual, lower.tail = FALSE))
p_value 

## Comparing Models w/ and w/o gender
Cons1.2_reduced <- update(Cons1.2, . ~ . - gender)
anova(Cons1.2_reduced, Cons1.2, test = "Chisq")


### Labour party analysis 
Labour <- subset(AggregatedData, party %in% c("Labour"))
Labour$gender <- relevel(Labour$gender, ref = "M")

## Env Dummy 
Labour1<-glm.nb(EnvDummy~gender+YearsExp+ opp_role+ EnvCommittee+DebateTopic, data = Labour, control = glm.control(maxit = 100))

## Clustered Standard Errors
cluster_vcov_lab <- vcovCR(Labour1, cluster = Labour$speakername, type = "CR0")

p_value <- with(Labour1, pchisq(deviance, df.residual, lower.tail = FALSE))
p_value #0.244 = bigger than 0.05 = good.

## Comparing Models w/ and w/o gender
Labour1_reduced <- update(Labour1, . ~ . - gender)
anova(Labour1_reduced, Labour1, test = "Chisq")

## Env speech rate
Labour1.2 <- glm(
  cbind(env_speeches, total_speeches - env_speeches) ~
    gender + YearsExp + gvt_role + opp_role + EnvCommittee + RulingParty + DebateTopic + as.factor(year),
  data = Labour,
  family = binomial(),
  control = glm.control(maxit = 100)
)
Labour1.2cluster_vcov <- vcovCR(Labour1.2, cluster = Labour$speakername, type = "CR0")

p_value <- with(Labour1.2, pchisq(deviance, df.residual, lower.tail = FALSE))
p_value 

## Comparing Models w/ and w/o gender
Labour1.2_reduced <- update(Labour1.2, . ~ . - gender)
anova(Labour1.2_reduced, Labour1.2, test = "Chisq")

### Lib Dem analysis 
LibDem <- subset(AggregatedData, party %in% c("Liberal Democrat"))
LibDem$gender <- relevel(LibDem$gender, ref = "M")

## Env Dummy 
LibDem1<-glm.nb(EnvDummy~gender+YearsExp+ gvt_role+ opp_role+ EnvCommittee+DebateTopic, data = LibDem, control = glm.control(maxit = 100))

##  Clustered Standard Errors
cluster_vcov_libdem <- vcovCR(LibDem1, cluster = LibDem$speakername, type = "CR0")

p_value <- with(LibDem1, pchisq(deviance, df.residual, lower.tail = FALSE))
p_value #0.089 = bigger than 0.05 = good.

## Comparing Models w/ and w/o gender
LibDem1_reduced <- update(LibDem1, . ~ . - gender)
anova(LibDem1_reduced, LibDem1, test = "Chisq")

## Env speech rate
LibDem1.2 <- glm(
  cbind(env_speeches, total_speeches - env_speeches) ~
    gender + YearsExp + gvt_role + opp_role + EnvCommittee + RulingParty + DebateTopic + as.factor(year),
  data = LibDem,
  family = binomial(),
  control = glm.control(maxit = 100)
)
LibDem1.2cluster_vcov <- vcovCR(LibDem1.2, cluster = LibDem$speakername, type = "CR0")

p_value <- with(LibDem1.2, pchisq(deviance, df.residual, lower.tail = FALSE))
p_value

## Comparing Models w/ and w/o gender
LibDem1.2_reduced <- update(LibDem1.2, . ~ . - gender)
anova(LibDem1.2_reduced, LibDem1.2, test = "Chisq")

### SNP analysis 
SNP <- subset(AggregatedData, party %in% c("Scottish National Party"))
SNP$gender <- relevel(SNP$gender, ref = "M")

##Env Dummy
SNP1<-glm.nb(EnvDummy~gender+YearsExp+opp_role+ EnvCommittee+DebateTopic, data = SNP, control = glm.control(maxit = 100))

## Clustered Standard Errors
cluster_vcov_SNP <- vcovCR(SNP1, cluster = SNP$speakername, type = "CR0")

p_value <- with(SNP1, pchisq(deviance, df.residual, lower.tail = FALSE))
p_value #0.078 = bigger than 0.05 = good.

## Comparing Models w/ and w/o gender
SNP1_reduced <- update(SNP1, . ~ . - gender)
anova(SNP1_reduced, SNP1, test = "Chisq")

## Env speech rate 
SNP1.2 <- glm(
  cbind(env_speeches, total_speeches - env_speeches) ~
    gender + YearsExp + gvt_role + opp_role + EnvCommittee + RulingParty + DebateTopic + as.factor(year),
  data = SNP,
  family = binomial(),
  control = glm.control(maxit = 100)
)
SNP1.2cluster_vcov <- vcovCR(SNP1.2, cluster = SNP$speakername, type = "CR0")

p_value <- with(SNP1.2, pchisq(deviance, df.residual, lower.tail = FALSE))
p_value 

## Comparing Models w/ and w/o gender
SNP1.2_reduced <- update(SNP1.2, . ~ . - gender)
anova(SNP1.2_reduced, SNP1.2, test = "Chisq")

### Labour (Co-op) analysis
LabCoOp <- subset(AggregatedData, party %in% c("Labour (Co-op)"))
LabCoOp$gender <- relevel(LabCoOp$gender, ref = "M")

## Env Dummy 
LabCoOp1<-glm.nb(EnvDummy~gender+YearsExp+opp_role+EnvCommittee+DebateTopic, data = LabCoOp, control = glm.control(maxit = 100))

## Clustered Standard Errors
cluster_vcov_Labcoop <- vcovCR(LabCoOp1, cluster = LabCoOp$speakername, type = "CR0")

p_value <- with(LabCoOp1, pchisq(deviance, df.residual, lower.tail = FALSE))
p_value #0.125 = bigger than 0.05 = good.

## Comparing Models w/ and w/o gender
LabCoOp1_reduced <- update(LabCoOp1, . ~ . - gender)
anova(LabCoOp1_reduced, LabCoOp1, test = "Chisq")

## Env speech rate
LabCoOp1.2 <- glm(
  cbind(env_speeches, total_speeches - env_speeches) ~
    gender + YearsExp + gvt_role + opp_role + EnvCommittee + RulingParty + DebateTopic + as.factor(year),
  data = LabCoOp,
  family = binomial(),
  control = glm.control(maxit = 100)
)
LabCoOp1.2cluster_vcov <- vcovCR(LabCoOp1.2, cluster = LabCoOp$speakername, type = "CR0")

p_value <- with(LabCoOp1.2, pchisq(deviance, df.residual, lower.tail = FALSE))
p_value 

## Comparing Models w/ and w/o gender
LabCoOp1.2_reduced <- update(LabCoOp1.2, . ~ . - gender)
anova(LabCoOp1.2_reduced, LabCoOp1.2, test = "Chisq")

### Independent party analysis Independent 
Independent <- subset(AggregatedData, party %in% c("Independent"))
Independent$gender <- relevel(Independent$gender, ref = "M")

## Env Dummy 
Independent1<-glm.nb(EnvDummy~gender+YearsExp+ gvt_role+opp_role+EnvCommittee+DebateTopic, data = Independent, control = glm.control(maxit = 100))

## Clustered Standard Errors
cluster_vcov_ind <- vcovCR(Independent1, cluster = Independent$speakername, type = "CR0")

p_value <- with(Independent1, pchisq(deviance, df.residual, lower.tail = FALSE))
p_value #0.521 = bigger than 0.05 = good.

## Comparing Models w/ and w/o gender
Independent1_reduced <- update(Independent1, . ~ . - gender)
anova(Independent1_reduced, Independent1, test = "Chisq")

## Env speech rates 
Independent1.2 <- glm(
  cbind(env_speeches, total_speeches - env_speeches) ~
    gender + YearsExp + gvt_role + opp_role + EnvCommittee + RulingParty + DebateTopic + as.factor(year),
  data = Independent,
  family = binomial(),
  control = glm.control(maxit = 100)
)
Independent1.2cluster_vcov <- vcovCR(Independent1.2, cluster = Independent$speakername, type = "CR0")

p_value <- with(Independent1.2, pchisq(deviance, df.residual, lower.tail = FALSE))
p_value

## Comparing Models w/ and w/o gender
Independent1.2_reduced <- update(Independent1.2, . ~ . - gender)
anova(Independent1.2_reduced, Independent1.2, test = "Chisq")


### Printing all party models with clustered standard errors 

## Env Dummy 
modelsummary(
  list(
    "Conservative" = Cons1,
    "Labour" = Labour1,
    "Lib Dem" = LibDem1,
    "SNP" = SNP1,
    "Lab (Co-op)" = LabCoOp1,
    "Independent" = Independent1
  ),
  vcov = list(
    cluster_vcov_cons,
    cluster_vcov_lab,
    cluster_vcov_libdem,
    cluster_vcov_SNP,
    cluster_vcov_Labcoop,
    cluster_vcov_ind
  ),
  exponentiate = TRUE,
  conf_level = 0.95,
  statistic = "std.error",
  stars = TRUE
)

## Env speech rates 
modelsummary(
  list(
    "Conservative" = Cons1.2,
    "Labour" = Labour1.2,
    "Lib Dem" = LibDem1.2,
    "SNP" = SNP1.2,
    "Lab (Co-op)" = LabCoOp1.2,
    "Independent" = Independent1.2
  ),
  vcov = list(
    Cons1.2cluster_vcov,
    Labour1.2cluster_vcov,
    LibDem1.2cluster_vcov,
    SNP1.2cluster_vcov,
    LabCoOp1.2cluster_vcov,
    Independent1.2cluster_vcov
  ),
  exponentiate = TRUE,
  conf_level = 0.95,
  statistic = "std.error",
  stars = TRUE
)

### Party interaction model

AggregatedData$gender <- relevel(AggregatedData$gender, ref = "M")
AggregatedData$party<-as.factor(AggregatedData$party)
AggregatedData_subset <- AggregatedData %>%
  filter(party %in% c(
    "Conservative",
    "Labour",
    "Liberal Democrat",
    "Scottish National Party",
    "Labour (Co-op)",
    "Independent"
  ))
library(MASS)
PartyInteraction<-glm.nb(EnvDummy~gender*party+YearsExp+gvt_role+opp_role+ EnvCommittee+DebateTopic, data = AggregatedData_subset, control = glm.control(maxit = 100))
summary(PartyInteraction)

library(stargazer)
library(ggeffects)
stargazer(PartyInteraction, type = "html", out = "regression_table.html")

preds <- ggpredict(PartyInteraction, terms = c("party", "gender"))

# Plot using ggplot

plot(preds) + 
  aes(shape = group, color = NULL) +  # Use shapes instead of color
  scale_shape_manual(values = c(16, 17)) +  # Optional: custom shapes for gender
  scale_color_manual(values = c("black", "black"), guide = "none") +
  labs(
    title = "Predicted Counts of Environmental Speeches",
    x = "Party", 
    y = "Predicted Probability of Environmental Speeches",
    shape = "Gender"  # This controls the legend title
  ) +
  theme_minimal()


##___________________________-

#Appendix: 

## Parties in dataset with n speakers
table <- AggregatedData %>% ## checking what parties are in the dataset 
  group_by(party) %>%
  summarise(n_speakers = n_distinct(speakername)) %>%
  arrange(desc(n_speakers))
print(table)

## Party fixed effects model--> appendix model 1
PartyFixedEffects<-glm.nb(EnvDummy~gender+YearsExp+gvt_role+opp_role+ EnvCommittee+DebateTopic+as.factor(party), data = AggregatedData, control = glm.control(maxit = 100))

## For exponentiated results: tidy() doesn't work so done manually here:
coefs <- summary(PartyFixedEffects)$coefficients
exp_estimates <- exp(coefs[, 1])
confint_manual <- exp(confint.default(PartyFixedEffects))

exp_df <- data.frame(
  Term = rownames(coefs),
  Estimate = exp_estimates,
  Conf.Low = confint_manual[, 1],
  Conf.High = confint_manual[, 2],
  p.value = coefs[, 4]
)
exp_df

p_value <- with(PartyFixedEffects, pchisq(deviance, df.residual, lower.tail = FALSE))
p_value #0.058 = bigger than 0.05 = good.


### ADD: test whether zero inflated negative binomial is necessary using MASS package
## Zero inflated negative binomial 
library(MASS)
# Fit NB model first
nb_model <- glm.nb(EnvDummy~gender+YearsExp+gvt_role+opp_role+ EnvCommittee+RulingParty+DebateTopic, data = AggregatedData, control = glm.control(maxit = 100))

# Predict zero probabilities from NB
predicted_zeros <- dnbinom(0, size = nb_model$theta, mu = predict(nb_model, type = "response"))

# Compare observed proportion of zeros to predicted
observed_zeros <- mean(AggregatedData$EnvDummy == 0)
cat("Observed zeros:", observed_zeros, "\nExpected zeros under NB:", mean(predicted_zeros))



## Gender Breakdown of Party Membership, Appendix table 10
Data <- Data %>%
  mutate(observation_number = row_number())

count_gen_f2 <- Data %>%
  filter(gender == "F") %>%
  filter(party== "Conservative")%>%
  filter(EnvDummy =="1")%>%
  distinct(observation_number) %>%
  nrow()

count_gen_m2 <- Data %>%
  filter(gender == "M") %>%
  filter(party== "Conservative")%>%
  filter(EnvDummy =="1")%>%
  distinct(observation_number) %>%
  nrow()

Men_never<-Data %>%
  group_by(speakername) %>%
  filter(gender == "M") %>%
  summarise(has_env = any(EnvDummy == 1)) %>%
  filter(has_env == FALSE) %>%
  summarise(count = n())
print(Men_never)

Women_never<-Data %>%
  group_by(speakername) %>%
  filter(gender == "F") %>%
  summarise(has_env = any(EnvDummy == 1)) %>%
  filter(has_env == FALSE) %>%
  summarise(count = n())
print(Women_never)


print(paste("Conservative Women: Environmental Speeches", count_gen_f2))
print(paste("Conservative Men: Environmental Speeches:", count_gen_m2))


Data <- Data %>%
  mutate(observation_number = row_number())

count_gen_f2 <- Data %>%
  filter(gender == "F") %>%
  filter(party== "Conservative")%>%
  filter(EnvDummy =="1")%>%
  distinct(observation_number) %>%
  nrow()
count_gen_m2 <- Data %>%
  filter(gender == "M") %>%
  filter(party== "Conservative")%>%
  filter(EnvDummy =="1")%>%
  distinct(observation_number) %>%
  nrow()
print(paste("Conservative Women: Environmental Speeches", count_gen_f2))
print(paste("Conservative Men: Environmental Speeches:", count_gen_m2))


## Men vs women speakers 
speakers<- unique(Data$speakername)
length(speakers)

count_gen_f <- Data %>%
  filter(gender == "F") %>%
  filter(party == "Conservative") %>%
  distinct(speakername) %>%
  nrow()
count_gen_m <- Data %>%
  filter(gender == "M") %>%
  filter(party == "Conservative") %>%
  distinct(speakername) %>%
  nrow()
count_total <- Data %>%
  filter(party == "Conservative") %>%
  distinct(speakername) %>%
  nrow()

count_Consspeech <- Data %>%
  filter(EnvDummy == "1") %>%
  filter(party == "Conservative") %>%
  distinct(observation_number) %>%
  nrow()

print(paste("Women Speakers", count_gen_f))
print(paste("Men Speakers", count_gen_m))
print(paste("All Speakers", count_total))
print(paste("Total Conservative environmental speeches", count_Consspeech))

indcount_gen_f <- Data %>%
  filter(gender == "F") %>%
  filter(party == "Independent") %>%
  distinct(speakername) %>%
  nrow()
indcount_gen_m <- Data %>%
  filter(gender == "M") %>%
  filter(party == "Independent") %>%
  distinct(speakername) %>%
  nrow()

indcount_total <- Data %>%
  filter(party == "Independent") %>%
  distinct(speakername) %>%
  nrow()

print(paste("Women Speakers", indcount_gen_f))
print(paste("Men Speakers", indcount_gen_m))
print(paste("All Speakers", indcount_total))

##Labour
count_gen_f <- Data %>%
  filter(gender == "F") %>%
  filter(party == "Labour") %>%
  distinct(speakername) %>%
  nrow()
count_gen_m <- Data %>%
  filter(gender == "M") %>%
  filter(party == "Labour") %>%
  distinct(speakername) %>%
  nrow()
count_total <- Data %>%
  filter(party == "Labour") %>%
  distinct(speakername) %>%
  nrow()

print(paste("Women Speakers", count_gen_f))
print(paste("Men Speakers", count_gen_m))
print(paste("All Speakers", count_total))

## Lib Dem 
count_gen_f <- Data %>%
  filter(gender == "F") %>%
  filter(party == "Liberal Democrat") %>%
  distinct(speakername) %>%
  nrow()
count_gen_m <- Data %>%
  filter(gender == "M") %>%
  filter(party == "Liberal Democrat") %>%
  distinct(speakername) %>%
  nrow()
count_total <- Data %>%
  filter(party == "Liberal Democrat") %>%
  distinct(speakername) %>%
  nrow()

print(paste("Women Speakers", count_gen_f))
print(paste("Men Speakers", count_gen_m))
print(paste("All Speakers", count_total))

## SNP
count_gen_f <- Data %>%
  filter(gender == "F") %>%
  filter(party == "Scottish National Party") %>%
  distinct(speakername) %>%
  nrow()
count_gen_m <- Data %>%
  filter(gender == "M") %>%
  filter(party == "Scottish National Party") %>%
  distinct(speakername) %>%
  nrow()
count_total <- Data %>%
  filter(party == "Scottish National Party") %>%
  distinct(speakername) %>%
  nrow()

print(paste("Women Speakers", count_gen_f))
print(paste("Men Speakers", count_gen_m))
print(paste("All Speakers", count_total))

### Lab co op
count_gen_f <- Data %>%
  filter(gender == "F") %>%
  filter(party == "Labour (Co-op)") %>%
  distinct(speakername) %>%
  nrow()
count_gen_m <- Data %>%
  filter(gender == "M") %>%
  filter(party == "Labour (Co-op)") %>%
  distinct(speakername) %>%
  nrow()
count_total <- Data %>%
  filter(party == "Labour (Co-op)") %>%
  distinct(speakername) %>%
  nrow()

print(paste("Women Speakers", count_gen_f))
print(paste("Men Speakers", count_gen_m))
print(paste("All Speakers", count_total))

## Independent 
count_gen_f <- Data %>%
  filter(gender == "F") %>%
  filter(party == "Independent") %>%
  distinct(speakername) %>%
  nrow()
count_gen_m <- Data %>%
  filter(gender == "M") %>%
  filter(party == "Independent") %>%
  distinct(speakername) %>%
  nrow()
count_total <- Data %>%
  filter(party == "Independent") %>%
  distinct(speakername) %>%
  nrow()

print(paste("Women Speakers", count_gen_f))
print(paste("Men Speakers", count_gen_m))
print(paste("All Speakers", count_total))
